1
|
|
|
/* eslint-disable no-param-reassign */ |
2
|
|
|
import { start, end, indexToPosition, toHash, maxLettersSymbol } from '../src/utils/hashUtils'; |
3
|
|
|
|
4
|
|
|
/**
 * Hash of the imported `end` sentinel. `buildMarkov` uses it as the
 * transition target when no further token follows.
 */
const endHash = toHash(end);

/**
 * Hash of the imported `start` sentinel. `collectHashes` returns it as the
 * only hash for the leading `null` entry of a token sequence.
 */
const startHash = toHash(start);
6
|
|
|
|
7
|
|
|
/**
 * Reads `key` from `table` only if it is an own property.
 *
 * Plain objects inherit members such as `constructor` and `toString`; a bare
 * `table[key]` lookup would return those for identically named tokens.
 *
 * @param {Object} table - Dictionary-like object.
 * @param {string} key - Property name to look up.
 * @param {*} fallback - Value returned when `key` is not an own property.
 * @returns {*} The own value stored under `key`, or `fallback`.
 */
function readOwn(table, key, fallback) {
  return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : fallback;
}

/**
 * Stores `value` under `key` as an own, enumerable, writable property.
 *
 * `__proto__` is special-cased: a plain assignment would hit the inherited
 * setter and replace (or silently ignore) the prototype instead of storing data.
 *
 * @param {Object} table - Dictionary-like object, mutated in place.
 * @param {string} key - Property name to write.
 * @param {*} value - Value to store.
 * @returns {void}
 */
function writeOwn(table, key, value) {
  if (key === '__proto__') {
    Object.defineProperty(table, key, {
      value,
      writable     : true,
      enumerable   : true,
      configurable : true
    });
  } else {
    table[key] = value;
  }
}

/**
 * Divides every own enumerable value of `weights` by their total, in place.
 *
 * When the total is exactly 0 the values are left untouched, because dividing
 * would turn every entry into `NaN` (0 / 0).
 *
 * @param {Object<string, number>} weights - Map of accumulated weights.
 * @returns {Object<string, number>} The same `weights` object.
 */
function scaleToUnitSum(weights) {
  const keys = Object.keys(weights);
  const total = keys.reduce((sum, key) => sum + weights[key], 0);

  if (total === 0) {
    return weights;
  }

  for (const key of keys) {
    weights[key] = weights[key] / total;
  }

  return weights;
}

/**
 * Builds weighted lookup tables from a dataset of `{ word, weight }` items.
 *
 * All `build*` methods and `normalize` mutate the `model` object they receive
 * and return that same object.
 */
export default class ModelBuilder {
  /**
   * Collects the model keys for the token at `index`.
   *
   * For every window of up to `maxLettersSymbol` tokens ending at `index`, the
   * joined window text and `toHash({ symbol, position })` are both emitted,
   * shortest window first.
   *
   * @param {?string} letter - Current token, or `null` for the start marker.
   * @param {string[]} sanitized - Token list of the word.
   * @param {number} index - Index of the current token within `sanitized`.
   * @returns {Array} `[ startHash ]` when `letter` is `null`, otherwise the
   *   alternating symbol/hash list described above.
   */
  collectHashes(letter, sanitized, index) {
    if (letter === null) {
      return [ startHash ];
    }

    const hashes = [];

    // A window may not reach back past token 0. Without this cap the slice
    // start goes negative, wraps from the end of the array and re-emits
    // symbols that were already produced for short token lists.
    const reach = Math.min(maxLettersSymbol, index + 1);

    for (let limit = 0; limit < reach; limit++) {
      const symbol = sanitized.slice(index - limit, index + 1).join('');

      if (symbol.length > 0) {
        hashes.push(
          symbol,
          toHash({
            symbol,
            position : indexToPosition(index)
          })
        );
      }
    }

    return hashes;
  }

  /**
   * Applies the requested case conversion to `word`.
   *
   * @param {string} word - Input text.
   * @param {string} [caseType] - `'LOWER'` or `'UPPER'`; any other value
   *   leaves `word` unchanged.
   * @returns {string} The converted (or original) word.
   */
  sanitize(word, caseType) {
    if (caseType === 'LOWER') {
      return word.toLowerCase();
    }

    if (caseType === 'UPPER') {
      return word.toUpperCase();
    }

    return word;
  }

  /**
   * Splits `word` into tokens on `tokens.delim`.
   *
   * NOTE(review): `sanitize` is called without a `caseType`, so no case
   * conversion happens here - confirm whether that is intended.
   *
   * @param {string} word - Input text.
   * @param {{ delim: (string|RegExp) }} tokens - Tokenizer configuration.
   * @returns {string[]} The tokens of `word`.
   */
  getTokens(word, tokens) {
    return this.sanitize(word).split(tokens.delim);
  }

  /**
   * Accumulates transition weights into `model`.
   *
   * For each token position, every key from `collectHashes` gets
   * `Math.log(1 + weight)` added under the token that follows it, or under
   * `endHash` when nothing follows.
   *
   * @param {Iterable<{ word: string, weight: number }>} dataset - Training items.
   * @param {Object} model - Target table, mutated in place.
   * @param {{ delim: (string|RegExp) }} tokenConfig - Passed to `getTokens`.
   * @returns {Object} The same `model` object.
   */
  buildMarkov(dataset, model, tokenConfig) {
    for (const { word, weight } of dataset) {
      const sanitized = this.getTokens(word, tokenConfig);
      const array = [ null, ...sanitized ];
      const gain = Math.log(1 + weight);

      for (let index = 0; index < array.length; index++) {
        const hashes = this.collectHashes(array[index], sanitized, index - 1);

        // An empty-string token is falsy and is therefore treated like the
        // end of the word, exactly as a missing next token is.
        const nextSymbol = array[index + 1] || endHash;

        for (const hash of hashes) {
          let map = readOwn(model, hash, undefined);

          if (!map) {
            map = {};
            writeOwn(model, hash, map);
          }

          const old = readOwn(map, nextSymbol, 0) || 0;

          writeOwn(map, nextSymbol, old + gain);
        }
      }
    }

    return model;
  }

  /**
   * Turns accumulated weights into relative frequencies.
   *
   * Entries with at most one successor are deleted from `model`; every
   * remaining successor map is scaled so its values sum to 1 (maps whose
   * values sum to 0 are left unchanged).
   *
   * @param {Object<string, Object<string, number>>} model - Table produced by
   *   `buildMarkov`, mutated in place.
   * @returns {Object} The same `model` object.
   */
  normalize(model) {
    for (const hash of Object.keys(model)) {
      const map = model[hash];

      if (Object.keys(map).length <= 1) {
        delete model[hash];
        continue;
      }

      scaleToUnitSum(map);
    }

    return model;
  }

  /**
   * Builds a flat word-frequency table.
   *
   * Adds `Math.log(1 + weight)` per item under its sanitized word, then scales
   * all values of `model` so they sum to 1 (left unchanged when the sum is 0).
   *
   * NOTE(review): `tokenConfig` is forwarded to `sanitize` as its `caseType`;
   * it only has an effect when it is the string `'LOWER'` or `'UPPER'` -
   * confirm against callers.
   *
   * @param {Iterable<{ word: string, weight: number }>} dataset - Training items.
   * @param {Object<string, number>} model - Target table, mutated in place.
   * @param {*} tokenConfig - Forwarded to `sanitize` as `caseType`.
   * @returns {Object<string, number>} The same `model` object.
   */
  buildStatic(dataset, model, tokenConfig) {
    for (const { word, weight } of dataset) {
      const sanitized = this.sanitize(word, tokenConfig);
      const old = readOwn(model, sanitized, 0) || 0;

      writeOwn(model, sanitized, old + Math.log(1 + weight));
    }

    scaleToUnitSum(model);

    return model;
  }
}
105
|
|
|
|
Consider adding curly braces around every statement that is executed conditionally. Braces are optional when the body is a single statement, but leaving them out can lead to unexpected behaviour if another statement is added later.
Consider:
    if (debug) console.log('b is', b);
If you or someone else later decides to put another statement in, only the first statement will be executed conditionally:
    if (debug) console.log('b is', b);
        b = 42;
In this case the statement
b = 42
will always be executed, while the logging statement will be executed conditionally. Wrapping the body in curly braces ensures that the proper code will be executed conditionally no matter how many statements are added or removed.